"""Tokenizer initialization: lowercase, tokenize, and lemmatize English text with spaCy."""

import spacy

# Load the small English pipeline; its tagger and lemmatizer components are
# required so that token.lemma_ is populated below.
nlp = spacy.load("en_core_web_sm")

def tokenize_en(crud_list):
    """Lowercase each input line, lemmatize its tokens, and return one
    space-joined string of lemmas per line."""
    res_list = []
    for line in crud_list:
        # Run the full pipeline on the lowercased line so lemmas are available.
        doc = nlp(line.lower())
        lemmas = [token.lemma_ for token in doc]
        res_list.append(' '.join(lemmas))
    return res_list
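
# Minimal usage sketch: the sample sentences below are made-up illustration
# data, not part of the original module. Running the file directly should
# print each line next to its lemmatized form (e.g. "running" -> "run",
# "were" -> "be"); exact lemmas depend on the en_core_web_sm model version.
if __name__ == "__main__":
    sample = [
        "The cats were running faster than the dogs.",
        "She has written several well-known papers.",
    ]
    for original, lemmatized in zip(sample, tokenize_en(sample)):
        print(original, "->", lemmatized)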
